import os
import gseapy as gp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from rapidfuzz import process
import mygene
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import requests
Drug-»Disease dataset preprocessing¶
Messed some things up with "cell_id", pirate ship is sinking, saving the gold
# Standardize cell IDs everywhere
expr_matrix.index = expr_matrix.index.astype(str)
if meta_df.index.name != 'cell_id':
meta_df = meta_df.reset_index()
if 'cell_id' not in meta_df.columns:
meta_df['cell_id'] = meta_df.index.astype(str)
meta_df['cell_id'] = meta_df['cell_id'].astype(str).str.strip()
# Align metadata
meta_df = meta_df.set_index('cell_id')
assert set(expr_matrix.index) == set(meta_df.index), "Mismatch in cell IDs!"
# Then create AnnData
import scanpy as sc
adata = sc.AnnData(expr_matrix.astype(float))
adata.obs = meta_df.loc[expr_matrix.index]
adata.obs.index = adata.obs.index.astype(str)
adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].to_numpy()
adata.obs['cell_id'] = adata.obs.index
#reading the dataset from a csv file
#this csv file is a dataset about drug-target information containting 54851 records and 10 attributes
#key parts of this initial dataset is the list of diseases, corresponding drugs effective againts them and the interaction scores showing how effective is the drug against this disease
#downloaded from http://zhanglab.hzau.edu.cn/GETdb/page/download.jsp
DrugTargetInfo = pd.read_csv("/home/alexandrarolya/Pablo/GETdb_drugTargetInfo.csv")
print(DrugTargetInfo.shape)
print(DrugTargetInfo.head())
(54851, 10)
drug_name drug_type \
0 zinc acetate small molecule
1 copper small molecule
2 zinc small molecule
3 zinc chloride small molecule
4 ocriplasmin biotech
drug_MeSHID \
0 Growth Disorders[MeSHID:D006130],Diarrhea[MeSH...
1 Disruptive, Impulse Control, and Conduct Disor...
2 Growth Disorders[MeSHID:D006130],Diarrhea[MeSH...
3 Serum[MeSHID:D044967],Syndrome[MeSHID:D013577]...
4 Tissue Adhesions[MeSHID:D000267],Disruptive, I...
drug_status target_name gene_name target_type \
0 approved,investigational alpha-1b-glycoprotein A1BG NaN
1 approved,investigational alpha-1b-glycoprotein A1BG NaN
2 approved,investigational alpha-1b-glycoprotein A1BG NaN
3 approved,investigational alpha-2-macroglobulin A2M NaN
4 approved alpha-2-macroglobulin A2M NaN
target_action interaction_score source
0 unknown NaN drugbank
1 unknown NaN drugbank
2 unknown NaN drugbank
3 ligand NaN drugbank
4 ligand NaN drugbank
Initial filtering¶
# Keep only rows with non-null interaction scores
dtf = DrugTargetInfo[DrugTargetInfo['interaction_score'].notna()].copy()
print(f"Rows with valid interaction score: {dtf.shape[0]}")
# Standartizing gene names
dtf['gene_name'] = dtf['gene_name'].str.upper()
#extracting breast cancer related genes
bc_dtf = dtf[dtf['drug_MeSHID'].str.contains('breast', case=False, na=False)]
print(f"Breast cancer–related drug-target pairs: {bc_dtf.shape[0]}")
print(bc_dtf[['drug_name', 'gene_name', 'interaction_score', 'drug_MeSHID']].head())
Rows with valid interaction score: 10543
Breast cancer–related drug-target pairs: 371
drug_name gene_name interaction_score \
81 tariquidar ABCB1 0.66
84 tesmilifene ABCB1 0.11
86 tariquidar ABCB1 0.66
94 tariquidar ABCB1 0.66
95 tariquidar ABCB1 0.66
drug_MeSHID
81 Malignant neoplasm of breast[MeSHID:D001943],M...
84 Malignant Neoplasms[MeSHID:D009369],Disruptive...
86 Malignant neoplasm of breast[MeSHID:D001943],M...
94 Malignant neoplasm of breast[MeSHID:D001943],M...
95 Malignant neoplasm of breast[MeSHID:D001943],M...
# Reset index to start from 1 (optional: drop old index)
bc_dtf_reset = bc_dtf.reset_index(drop=True)
bc_dtf_reset.index += 1 # Start numbering rows at 1
# Save to CSV with the new index as a column in the saved file
bc_dtf_reset.to_csv("DrugName_BreastCancer.csv", index_label="RowNumber")
print("Saved filtered breast cancer drug-target data to 'DrugName_BreastCancer.csv'")
Saved filtered breast cancer drug-target data to 'DrugName_BreastCancer.csv'
Filtered dataset¶
Since the initial dataset was way too complex and showed too much data in one row and i have separated all the MeSH_id-s in seperate rows in order to then properly extract the diseases I am curious about
#Split the 'drug_MeSHID' string by comma into lists
bc_dtf['MeSH_list'] = bc_dtf['drug_MeSHID'].str.split(',')
#Explode the list of MeSH IDs into separate rows
bc_dtf_expanded = bc_dtf.explode('MeSH_list').reset_index(drop=True)
#Remove leading/trailing whitespace in the newly created 'MeSH_list' column
bc_dtf_expanded['MeSH_list'] = bc_dtf_expanded['MeSH_list'].str.strip()
#Drop the original 'drug_MeSHID' column
bc_dtf_expanded = bc_dtf_expanded.drop(columns=['drug_MeSHID'])
#Save this expanded dataframe—each MeSH ID in its own row
#Reset index starting from 1 as per your previous request
bc_dtf_expanded = bc_dtf_expanded.reset_index(drop=True)
bc_dtf_expanded.index += 1
# Rename index column for clarity
bc_dtf_expanded.to_csv("DrugName_BreastCancer_ExpandedMeSH.csv", index_label="RowNumber")
print(f"Expanded table saved with {bc_dtf_expanded.shape[0]} rows, one MeSH ID per row.")
Expanded table saved with 3521 rows, one MeSH ID per row.
/tmp/ipykernel_62984/3875924426.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
bc_dtf['MeSH_list'] = bc_dtf['drug_MeSHID'].str.split(',')
Here i select the disease i want to dive into -» in this case it is a breast cancer
df = pd.read_csv("DrugName_BreastCancer_ExpandedMeSH.csv")
cols = ["RowNumber", "MeSH_list", "drug_name", "target_name", "gene_name", "interaction_score"]
df = df[cols]
df = df.rename(columns={
"RowNumber": "old_row_number",
"MeSH_list": "mesh_id", #other columns are unchanged
})
bc_only = df[df["mesh_id"].str.contains("breast", case=False, na=False)] #filtering for breast terms
bc_only = bc_only.reset_index(drop=True)
bc_only.index += 1
bc_only.to_csv("UPD_DrugTarget_BreastCancer.csv", index=False) #saving without index column
print(f"Saved breast cancer specific data with neat, consistent columns to UPD_DrugTarget_BreastCancer.csv")
print(f"Created table size: {bc_only.shape[0]} rows, {bc_only.shape[1]} columns")
print(bc_only.head())
Saved breast cancer specific data with neat, consistent columns to UPD_DrugTarget_BreastCancer.csv
Created table size: 458 rows, 6 columns
old_row_number mesh_id drug_name \
1 1 Malignant neoplasm of breast[MeSHID:D001943] tariquidar
2 12 Malignant neoplasm of breast[MeSHID:D001943] tesmilifene
3 13 Malignant neoplasm of breast[MeSHID:D001943] tariquidar
4 20 Malignant neoplasm of breast[MeSHID:D001943] tariquidar
5 27 Malignant neoplasm of breast[MeSHID:D001943] tariquidar
target_name gene_name interaction_score
1 multidrug resistance protein 1 ABCB1 0.66
2 p-glycoprotein 1 ABCB1 0.11
3 multidrug resistance protein 1 ABCB1 0.66
4 p-glycoprotein 1 ABCB1 0.66
5 p-glycoprotein 1 ABCB1 0.66
In order to be sure, that the extracted data is only regarding the breast cancer and not any type of breast related diseases I wish to see the MeSH terms, thus the extracted breast related diseases, and then maybe I will need to exclude some data if it is not cancer related.
df = pd.read_csv("UPD_DrugTarget_BreastCancer.csv")
# Get the unique mesh_term (or mesh_id) values
unique_mesh_ids = df['mesh_id'].unique()
print(f"Number of unique MeSH terms: {len(unique_mesh_ids)}")
print("List of unique MeSH terms (showing up to 100):")
for i, term in enumerate(unique_mesh_ids[:100]):
print(f"{i+1}. {term}")
Number of unique MeSH terms: 7 List of unique MeSH terms (showing up to 100): 1. Malignant neoplasm of breast[MeSHID:D001943] 2. Triple Negative Breast Neoplasms[MeSHID:D064726] 3. Breast Fibrocystic Disease[MeSHID:D005348] 4. Breast[MeSHID:D001940] 5. Breast Feeding[MeSHID:D001942] 6. Inflammatory Breast Carcinoma[MeSHID:D058922] 7. Breast Diseases[MeSHID:D001941]
As we see, only 3 of the diseases are actually solely brest cancer related; thus for further analysis, I am only selecting these 3 MeSH-terms.
df = pd.read_csv("UPD_DrugTarget_BreastCancer.csv")
cancer_terms = [
"Malignant neoplasm of breast[MeSHID:D001943]",
"Triple Negative Breast Neoplasms[MeSHID:D064726]",
"Inflammatory Breast Carcinoma[MeSHID:D058922]"
]
# Filter for strict MeSH terms
cancer_only = df[df["mesh_id"].isin(cancer_terms)].reset_index(drop=True)
cancer_only.index += 1
# Save STRICT file without index column (index=False)
cancer_only.to_csv("UPD_DrugTarget_BreastCancer_STRICT.csv", index=False)
print(f"Filtered dataset saved to UPD_DrugTarget_BreastCancer_STRICT.csv with size: {cancer_only.shape}")
print(cancer_only['mesh_id'].value_counts())
Filtered dataset saved to UPD_DrugTarget_BreastCancer_STRICT.csv with size: (363, 6) mesh_id Malignant neoplasm of breast[MeSHID:D001943] 340 Triple Negative Breast Neoplasms[MeSHID:D064726] 21 Inflammatory Breast Carcinoma[MeSHID:D058922] 2 Name: count, dtype: int64
As for now I have a csv file "UPD_DrugTarget_BreastCancer_STRICT.scv" which has the information about the type of disease (MeSH-id), the drug which is effective against this specific disease and the interaction score showing how effective is this drug against this disease.
#takes around 1-2 minutes
expr_matrix_raw = pd.read_csv("/share/ulli/HumanBreastCancerPatient1_cell_by_gene.csv")
# Extracting cell names and setzing as index
if 'cell' in expr_matrix_raw.columns:
metadata = expr_matrix_raw[['cell']].copy()
expr_matrix = expr_matrix_raw.drop(columns=['cell'])
expr_matrix.index = metadata['cell']
else:
expr_matrix = expr_matrix_raw
# Making all gene names uppercase for matching
expr_matrix.columns = expr_matrix.columns.str.upper()
Loading cleaned Drug-Disease dataset¶
- keeping "drug_name", "gene_name" and "interaction score"
- uppercase gene names
drug_target_df = pd.read_csv("UPD_DrugTarget_BreastCancer_STRICT.csv")
#I need only these columns: 'drug_name', 'gene_name', 'interaction_score'
drug_target_df['gene_name'] = drug_target_df['gene_name'].str.upper()
Scoring each drug per cell¶
- based on the weighted expression of drug target genes -» how strongly each cell expresses the targets of each drug
def score_drugs(expr_matrix, drug_targets):
"""
Calculate drug target expression scores for each cell.
Args:
expr_matrix: DataFrame with cells as rows and uppercase gene names as columns.
drug_targets: DataFrame with columns [drug_name, gene_name, interaction_score].
Returns:
DataFrame with drugs as rows, cells as columns, values are weighted target expression sums.
"""
drugs = drug_targets['drug_name'].unique()
cells = expr_matrix.index
# Initialize DataFrame for scores
drug_scores = pd.DataFrame(index=drugs, columns=cells, dtype=float)
# Build mapping to speed up: drug -> list of (gene, score)
drug_to_targets = drug_targets.groupby('drug_name').apply(
lambda g: list(zip(g['gene_name'], g['interaction_score']))
).to_dict()
for drug, targets in drug_to_targets.items():
genes, weights = zip(*targets)
genes = [g.upper() for g in genes]
# Filter genes present in expression matrix
valid_genes = [g for g in genes if g in expr_matrix.columns]
if not valid_genes:
# Skip drugs without target genes in expr_matrix
continue
# Extract weighted expression and sum
weights = np.array([weights[genes.index(g)] for g in valid_genes])
expr_sub = expr_matrix[valid_genes]
weighted_expr = expr_sub.multiply(weights, axis=1)
score_per_cell = weighted_expr.sum(axis=1)
drug_scores.loc[drug] = score_per_cell.values
# Convert scores to numeric (handle missing)
drug_scores = drug_scores.apply(pd.to_numeric, errors='coerce')
return drug_scores
Running 🏃💨¶
#takes around 1-2 minutes
drug_scores = score_drugs(expr_matrix, drug_target_df)
print(drug_scores.shape) # Number of drugs x number of cells
print(drug_scores.head())
/tmp/ipykernel_62984/2740834893.py:19: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
drug_to_targets = drug_targets.groupby('drug_name').apply(
(143, 713121) cell 0 1 2 3 4 5 6 7 \ tariquidar NaN NaN NaN NaN NaN NaN NaN NaN tesmilifene NaN NaN NaN NaN NaN NaN NaN NaN bosutinib 7.02 1.17 1.17 2.34 0.0 1.17 1.17 5.85 dcc-2036 0.00 0.00 0.00 0.00 0.0 0.00 0.00 0.00 aderbasib NaN NaN NaN NaN NaN NaN NaN NaN cell 8 9 ... 713111 713112 713113 713114 713115 \ tariquidar NaN NaN ... NaN NaN NaN NaN NaN tesmilifene NaN NaN ... NaN NaN NaN NaN NaN bosutinib 4.68 0.0 ... 2.34 3.51 0.0 2.34 0.0 dcc-2036 0.00 0.0 ... 0.00 0.00 0.0 0.00 0.0 aderbasib NaN NaN ... NaN NaN NaN NaN NaN cell 713116 713117 713118 713119 713120 tariquidar NaN NaN NaN NaN NaN tesmilifene NaN NaN NaN NaN NaN bosutinib 0.0 1.17 0.0 0.0 0.0 dcc-2036 0.0 0.00 0.0 0.0 0.0 aderbasib NaN NaN NaN NaN NaN [5 rows x 713121 columns]
At this stage the data does not "know" which cell belongs to which subtype
The MeSH terms describe which drugs are associated with those diseases -» this infromation applies per drug, not per cell
Filtering the NaN data¶
# Remove drugs where all scores are NaN
drug_scores_filtered = drug_scores.dropna(how='all')
print(f"Filtered drug scores shape after removing drugs with all NaNs: {drug_scores_filtered.shape}")
Filtered drug scores shape after removing drugs with all NaNs: (66, 713121)
This has to be done because I messed some things up when trying to do the Squidpy
# For expression matrix
expr_matrix.index = expr_matrix.index.astype(str)
# For metadata
if meta_df.index.name != 'cell_id':
meta_df = meta_df.reset_index()
if 'cell_id' not in meta_df.columns:
meta_df['cell_id'] = meta_df.index.astype(str)
meta_df['cell_id'] = meta_df['cell_id'].astype(str).str.strip()
# For ssGSEA results or other annotations
ssgsea_results.res2d['Name'] = ssgsea_results.res2d['Name'].astype(str).str.strip()
meta_df = meta_df.set_index('cell_id')
meta_df_aligned = meta_df.loc[expr_matrix.index]
Now I have a matrix called "drug_scores_filtered":
each row = one drug
each column = once single cell singel-cell RNA-seq
values = each entry is a weighted sum of drug"s target genes" expression in that cell -» weigted by "interaction score"
Each number tells: for a given drug and a given cell, how “targetable” that cell is to that drug, in the sense that it highly expresses the genes/proteins that the drug is known to interact with.
If I treat this tumor or tissue with Drug X, how much of it is theoretically “druggable”?¶
Summarizing scores¶
# Mean score per drug (across all cells)
drug_mean_scores = drug_scores_filtered.mean(axis=1).sort_values(ascending=False)
# Max score per drug (shows each drug's best potential hit)
drug_max_scores = drug_scores_filtered.max(axis=1).sort_values(ascending=False)
Visualizing top drugs by mean score¶
Showing top drugs¶
drug_max_scores = drug_scores_filtered.max(axis=1).sort_values(ascending=False)
top10_mean = drug_mean_scores.head(10)
# Convert the Series to a DataFrame for easy use with hue
df = pd.DataFrame({
'Drug Name': top10_mean.index,
'Mean Drug Target Score': top10_mean.values
})
plt.figure(figsize=(10, 6))
sns.barplot(
x='Mean Drug Target Score',
y='Drug Name',
hue='Drug Name',
data=df,
palette='viridis',
legend=False
)
plt.xlabel("Mean Drug Target Score")
plt.ylabel("Drug Name")
plt.title("Top 10 Drugs by Mean Target Expression in Cells")
plt.tight_layout()
plt.show()
print("Top drugs by mean targetability:")
print(top10_mean)
drug_mean_scores.to_csv("DrugTarget2cell_Drug_Ranking_Mean.csv")
drug_max_scores.to_csv("DrugTarget2cell_Drug_Ranking_Max.csv")
Top drugs by mean targetability: mm-121 35.798636 trastuzumab 23.785497 margetuximab 22.652854 pol-6326 16.733455 pertuzumab 15.479450 lapatinib 13.234511 lee011 12.593070 g1t38 12.276789 ly2835219 10.186286 varlitinib 9.561961 dtype: float64
print("Top 10 drugs by mean target expression:")
print(drug_mean_scores.head(10))
print("Top 10 drugs by max target expression:")
print(drug_max_scores.head(10))
drug_mean_scores.to_csv("DrugTarget2cell_Drug_Ranking_Mean.csv")
drug_max_scores.to_csv("DrugTarget2cell_Drug_Ranking_Max.csv")
Top 10 drugs by mean target expression: mm-121 35.798636 trastuzumab 23.785497 margetuximab 22.652854 pol-6326 16.733455 pertuzumab 15.479450 lapatinib 13.234511 lee011 12.593070 g1t38 12.276789 ly2835219 10.186286 varlitinib 9.561961 dtype: float64 Top 10 drugs by max target expression: mm-121 1782.90 bavencio 1074.15 trastuzumab 803.25 pol-6326 802.62 margetuximab 765.00 as-1402 636.40 pertuzumab 522.75 atezolizumab 477.00 g1t38 438.05 lapatinib 415.44 dtype: float64
Distribution of a single drug's target score across all cells¶
# Replace 'drug_scores_filtered' with your actual DataFrame
drug_name = 'palbociclib' # Example drug name, change as needed
if drug_name in drug_scores_filtered.index:
scores = drug_scores_filtered.loc[drug_name].dropna()
plt.figure(figsize=(8, 5))
sns.histplot(scores, bins=50, kde=True, color='blue')
plt.title(f'Distribution of Target Scores for {drug_name}')
plt.xlabel('Drug Target Score')
plt.ylabel('Number of Cells')
plt.tight_layout()
plt.show()
else:
print(f"Drug {drug_name} not found in your score matrix.")
Thoughts?
- A bunch of cells with a low targetability score, but there is a subset of cells, that are highly expresses. These cells could be rare cell types?
- Seems like, this is quite logical? -» most of the cells cannnot be ideal target for any one drug -» but, subpopulations can be hhgly "druggable+ by some kind of specific compounds
- if i could somehow identify which cell types or clusters corrrespond to the high-scoring ccells I could understand which tumor parts Pablo may be most effective agains -» HOW?
Top drugs in each subtype¶
df = pd.read_csv("UPD_DrugTarget_BreastCancer_STRICT.csv")
df['drug_name'] = df['drug_name'].str.lower()
# Group by subtype and get the top N drugs per subtype based on interaction_score
top_drugs_per_subtype = {}
top_n = 5 # change this to select your desired top N
for subtype in df['mesh_id'].unique():
sub_df = df[df['mesh_id'] == subtype]
# Get mean (or max) score per drug for this subtype
drug_scores = sub_df.groupby('drug_name')['interaction_score'].mean()
# Sort and take the top N
top_drugs = drug_scores.sort_values(ascending=False).head(top_n).index.tolist()
top_drugs_per_subtype[subtype] = top_drugs
print("Top drugs per breast cancer subtype:")
for subtype, drugs in top_drugs_per_subtype.items():
print(f"{subtype}: {drugs}")
all_drugs = set()
plot_data = []
for subtype, drugs in top_drugs_per_subtype.items():
for d in drugs:
plot_data.append({'Subtype': subtype, 'Drug': d})
all_drugs.add(d)
df_plot = pd.DataFrame(plot_data)
plt.figure(figsize=(12, 7))
sns.countplot(data=df_plot, y='Drug', hue='Subtype', dodge=True, palette='Set2', order=list(all_drugs))
plt.title('Top Drugs Across Breast Cancer Subtypes')
plt.xlabel('Count (Number of Subtypes)')
plt.ylabel('Drug Name')
plt.legend(title='Subtype')
plt.tight_layout()
plt.show()
Top drugs per breast cancer subtype: Malignant neoplasm of breast[MeSHID:D001943]: ['denosumab', 'arcitumomab', 'bavencio', 'custirsen', 'as-1402'] Triple Negative Breast Neoplasms[MeSHID:D064726]: ['sar-566658', 'atezolizumab', 'eft508', 'bemcentinib', 'nivolumab'] Inflammatory Breast Carcinoma[MeSHID:D058922]: ['palbociclib']
Thoughts?
- many drugs are top hits of only a single subtypw, some of them are only among the top in the "malignant neoplams" category
- my candidate drugs are highly specific to the breast cancer subtype
Overlap counts of top drugs¶
labels = ['All subtypes overlap', 'Present in All & Malignant', 'Present in TNBC only', 'Present in Malignant only', 'Present in Inflammatory only']
counts = [0, 2, 6, 5, 1]
plt.figure(figsize=(10, 5))
plt.bar(labels, counts, color='steelblue')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Number of Drugs')
plt.title('Overlap of Top Drugs Across Breast Cancer Subtypes')
plt.tight_layout()
plt.show()
Just checking for some specific drugs:)¶
drug_target_df = pd.read_csv("UPD_DrugTarget_BreastCancer_STRICT.csv")
query = 'palbociclib' # Example drug name, change as needed
matches = drug_target_df[drug_target_df['drug_name'].str.lower().str.contains(query)]
print(f"Found {matches.shape[0]} entries for '{query}':")
print(matches)
Found 6 entries for 'palbociclib':
old_row_number mesh_id \
46 538 Triple Negative Breast Neoplasms[MeSHID:D064726]
47 552 Malignant neoplasm of breast[MeSHID:D001943]
48 553 Inflammatory Breast Carcinoma[MeSHID:D058922]
58 612 Triple Negative Breast Neoplasms[MeSHID:D064726]
59 626 Malignant neoplasm of breast[MeSHID:D001943]
60 627 Inflammatory Breast Carcinoma[MeSHID:D058922]
drug_name target_name gene_name interaction_score
46 palbociclib cyclin-dependent kinase 4 CDK4 0.71
47 palbociclib cyclin-dependent kinase 4 CDK4 0.71
48 palbociclib cyclin-dependent kinase 4 CDK4 0.71
58 palbociclib cyclin-dependent kinase 6 CDK6 0.54
59 palbociclib cyclin-dependent kinase 6 CDK6 0.54
60 palbociclib cyclin-dependent kinase 6 CDK6 0.54
Drug synonyms¶
Since I am using a lot of different databases the names of the drugs could be different, so by implicating the synonym dictionary I tend minimise the chance of getting low quality results.
import requests
from collections import defaultdict
import time
#takes around 2 minutes
def get_chembl_synonyms(drug_name):
"""
Query ChEMBL API to retrieve synonyms of a drug name.
Returns a set of synonyms including the original name.
"""
url = f'https://www.ebi.ac.uk/chembl/api/data/molecule/search.json?q={drug_name}'
try:
r = requests.get(url, timeout=10)
r.raise_for_status()
data = r.json()
synonyms = set([drug_name.lower()]) # Include original name
if 'molecules' in data and len(data['molecules']) > 0:
for molecule in data['molecules']:
# Molecule synonyms
for syn in molecule.get('molecule_synonyms', []):
synonyms.add(syn['synonym'].lower())
return synonyms
except Exception as e:
print(f"Warning: Could not fetch synonyms for '{drug_name}': {str(e)}")
return set([drug_name.lower()])
def build_synonym_dict(drug_names_list):
"""
For a list of drug names, build a synonym dictionary by querying ChEMBL.
"""
synonym_dict = defaultdict(set)
for drug in drug_names_list:
# Get synonyms from ChEMBL
syns = get_chembl_synonyms(drug)
canonical = drug.lower()
synonym_dict[canonical].update(syns)
# Be respectful to server; avoid too rapid queries
time.sleep(0.5)
return synonym_dict
# Example integration
# Load your filtered ranked drug list (replace with your actual file)
drug_list_df = pd.read_csv('DrugTarget2cell_Drug_Ranking_Mean.csv', index_col=0)
# Extract drug names (index if you saved rankings as Series)
drug_names = drug_list_df.index.tolist()
# Build synonym dictionary automatically via ChEMBL API
syn_dict = build_synonym_dict(drug_names)
# Print a sample of synonyms for verification
for can_name, syns in list(syn_dict.items())[:5]:
print(f"{can_name} -> {sorted(syns)}")
# You can then use this syn_dict to normalize and map your drug names in downstream analyses.
# Example function to map input drug to canonical form
def map_drug_to_canonical(drug_name, synonym_dict):
drug_lower = drug_name.lower()
for canonical, synonyms in synonym_dict.items():
if drug_lower in synonyms:
return canonical
return drug_lower
# Map your ranked list to canonical drug names
drug_list_df['canonical_drug'] = drug_list_df.index.map(lambda d: map_drug_to_canonical(d, syn_dict))
print(drug_list_df.head())
Warning: Could not fetch synonyms for 'mm-121': 'synonym'
Warning: Could not fetch synonyms for 'trastuzumab': 'synonym'
Warning: Could not fetch synonyms for 'margetuximab': 'synonym'
Warning: Could not fetch synonyms for 'pol-6326': 'synonym'
Warning: Could not fetch synonyms for 'pertuzumab': 'synonym'
Warning: Could not fetch synonyms for 'lapatinib': 'synonym'
Warning: Could not fetch synonyms for 'lee011': 'synonym'
Warning: Could not fetch synonyms for 'g1t38': 'synonym'
Warning: Could not fetch synonyms for 'ly2835219': 'synonym'
Warning: Could not fetch synonyms for 'varlitinib': 'synonym'
Warning: Could not fetch synonyms for 'azd5363': 'synonym'
Warning: Could not fetch synonyms for 'tucatinib': 'synonym'
Warning: Could not fetch synonyms for 'palbociclib': 'synonym'
Warning: Could not fetch synonyms for 'balixafortide': 'synonym'
Warning: Could not fetch synonyms for 'hki-272': 'synonym'
Warning: Could not fetch synonyms for 'gdc-0068': 'synonym'
Warning: Could not fetch synonyms for 'ribociclib succinate': 'synonym'
Warning: Could not fetch synonyms for 'mm-111': 'synonym'
Warning: Could not fetch synonyms for 'abemaciclib': 'synonym'
Warning: Could not fetch synonyms for 'azd8931': 'synonym'
Warning: Could not fetch synonyms for 'trilaciclib': 'synonym'
Warning: Could not fetch synonyms for 'bavencio': 'synonym'
Warning: Could not fetch synonyms for 'ljm716': 'synonym'
Warning: Could not fetch synonyms for 'spi-2012': 'synonym'
Warning: Could not fetch synonyms for 'as-1402': 'synonym'
Warning: Could not fetch synonyms for 'mm-302': 'synonym'
Warning: Could not fetch synonyms for 'sndx-275': 'synonym'
Warning: Could not fetch synonyms for 'ertumaxomab': 'synonym'
Warning: Could not fetch synonyms for 'atezolizumab': 'synonym'
Warning: Could not fetch synonyms for 'kw-2450': 'synonym'
Warning: Could not fetch synonyms for 'ave-1642': 'synonym'
Warning: Could not fetch synonyms for 'sar-566658': 'synonym'
Warning: Could not fetch synonyms for 'mcs-110': 'synonym'
Warning: Could not fetch synonyms for 'mcla-128': 'synonym'
Warning: Could not fetch synonyms for 'trastuzumab-dm1': 'synonym'
Warning: Could not fetch synonyms for 'seliciclib': 'synonym'
Warning: Could not fetch synonyms for 'pd-0325901': 'synonym'
Warning: Could not fetch synonyms for 'abexinostat': 'synonym'
Warning: Could not fetch synonyms for 'bosutinib': 'synonym'
Warning: Could not fetch synonyms for 'neratinib': 'synonym'
Warning: Could not fetch synonyms for 'xentuzumab': 'synonym'
Warning: Could not fetch synonyms for 'canertinib': 'synonym'
Warning: Could not fetch synonyms for 'enmd-2076': 'synonym'
Warning: Could not fetch synonyms for 'gdc-0032': 'synonym'
Warning: Could not fetch synonyms for 'alpelisib': 'synonym'
Warning: Could not fetch synonyms for 'buparlisib': 'synonym'
Warning: Could not fetch synonyms for 'paclitaxel': 'synonym'
Warning: Could not fetch synonyms for 'taxol': 'synonym'
Warning: Could not fetch synonyms for 'docetaxel': 'synonym'
Warning: Could not fetch synonyms for 'xl880': 'synonym'
Warning: Could not fetch synonyms for 'gdc-0077': 'synonym'
Warning: Could not fetch synonyms for 'ipi-549': 'synonym'
Warning: Could not fetch synonyms for 'brivanib': 'synonym'
Warning: Could not fetch synonyms for 'everolimus': 'synonym'
Warning: Could not fetch synonyms for 'yn-968d1': 'synonym'
Warning: Could not fetch synonyms for 'ibuprofen': 'synonym'
Warning: Could not fetch synonyms for 'nivolumab': 'synonym'
Warning: Could not fetch synonyms for 'pembrolizumab': 'synonym'
Warning: Could not fetch synonyms for 'temsirolimus': 'synonym'
Warning: Could not fetch synonyms for 'trastuzumab emtansine': 'synonym'
Warning: Could not fetch synonyms for 'mgcd516': 'synonym'
Warning: Could not fetch synonyms for 'su-14813': 'synonym'
Warning: Could not fetch synonyms for 'dcc-2036': 'synonym'
Warning: Could not fetch synonyms for 'ink128': 'synonym'
Warning: Could not fetch synonyms for 'tki258': 'synonym'
Warning: Could not fetch synonyms for 'me-344': 'synonym'
mm-121 -> ['mm-121']
trastuzumab -> ['trastuzumab']
margetuximab -> ['margetuximab']
pol-6326 -> ['pol-6326']
pertuzumab -> ['pertuzumab']
0 canonical_drug
mm-121 35.798636 mm-121
trastuzumab 23.785497 trastuzumab
margetuximab 22.652854 margetuximab
pol-6326 16.733455 pol-6326
pertuzumab 15.479450 pertuzumab
After calling this API and creating a dictionary I still see, that some drugs are "underrepresented" and I seems like they don't have any synonyms. However, I know, that they have synonyms. So I created a separate "manual_synonyms" dictionary, which can be manually "updated".
manual_synonyms = {
'mm-121': ['mm-121', 'seribantumab'],
'trastuzumab': ['trastuzumab', 'herceptin', 'trastuzumab emtansine', 'trastuzumab-dm1', 'kadcyla'],
'pol-6326': ['pol-6326', 'balixafortide'],
'pertuzumab': ['pertuzumab', 'perjeta'],
'lapatinib': ['lapatinib', 'tykerb', 'tyverb'],
'lee011': ['lee011', 'dalpiciclib'],
'g1t38': ['g1t38', 'margetuximab'],
'azd5363': ['azd5363', 'capivasertib'],
'tucatinib': ['tucatinib', 'tukysa'],
'palbociclib': ['palbociclib', 'ibrance'],
'balixafortide': ['balixafortide'],
'hki-272': ['hki-272', 'neratinib'],
'gdc-0068': ['gdc-0068', 'ipatasertib'],
'ribociclib succinate': ['ribociclib succinate', 'kisqali'],
'abemaciclib': ['abemaciclib', 'verzenio'],
'azd8931': ['azd8931', 'sapitinib'],
'bavencio': ['bavencio', 'avelumab'],
'sndx-275': ['sndx-275', 'entinostat'],
'atezolizumab': ['atezolizumab', 'tecentriq'],
'pd-0325901': ['pd-0325901', 'mirdametinib'],
'bosutinib': ['bosutinib', 'bosulif'],
'neratinib': ['neratinib', 'hki-272'],
'gdc-0032': ['gdc-0032', 'umbralisib'],
'alpelisib': ['alpelisib', 'piqray'],
'paclitaxel': ['paclitaxel', 'taxol'],
'taxol': ['taxol', 'paclitaxel'],
'xl880': ['xl880', 'foretinib'],
'gdc-0077': ['gdc-0077', 'abemaciclib'],
'everolimus': ['everolimus', 'afinitor'],
'nivolumab': ['nivolumab', 'opdivo'],
'pembrolizumab': ['pembrolizumab', 'keytruda'],
'ink128': ['ink128', 'sapanisertib'],
}
print("Manual synonyms with new entries added, ready for integration into your pipeline.")
Manual synonyms with new entries added, ready for integration into your pipeline.
Comparing drug candidates across all breast cancer and specific subtypes¶
Summarising top drugs / Subtype¶
drug_target_df = pd.read_csv("UPD_DrugTarget_BreastCancer_STRICT.csv")
drug_mean_scores = pd.read_csv("DrugTarget2cell_Drug_Ranking_Mean.csv", index_col=0).squeeze("columns")
# Identifying unique breast cancer subtypes
subtypes = drug_target_df['mesh_id'].unique()
# Finding top 10 drugs overall
top_all = drug_mean_scores.sort_values(ascending=False).head(10).index.tolist()
print("Top drugs for all breast cancer types:")
print(top_all)
# Finding top 10 drugs for each subtype
top_drugs_per_subtype = {}
for subtype in subtypes:
drugs_in_subtype = drug_target_df[drug_target_df['mesh_id'] == subtype]['drug_name'].unique()
scores_for_subtype = drug_mean_scores.loc[drug_mean_scores.index.isin(drugs_in_subtype)]
top_drugs_per_subtype[subtype] = scores_for_subtype.sort_values(ascending=False).head(10).index.tolist()
print(f"\nTop drugs for {subtype}:")
print(top_drugs_per_subtype[subtype])
# Finding overlaps and unique drugs per subtype
overlap = set(top_all)
for drugs in top_drugs_per_subtype.values():
overlap = overlap.intersection(set(drugs))
print(f"\nDrugs common across all compared sets: {sorted(overlap)}")
unique_per_subtype = {}
for subtype, drugs in top_drugs_per_subtype.items():
others = set(top_all) | set([d for s, dlist in top_drugs_per_subtype.items() if s != subtype for d in dlist])
unique_per_subtype[subtype] = sorted(set(drugs) - others)
print(f"\nUnique drugs for {subtype}: {unique_per_subtype[subtype]}")
Top drugs for all breast cancer types: ['mm-121', 'trastuzumab', 'margetuximab', 'pol-6326', 'pertuzumab', 'lapatinib', 'lee011', 'g1t38', 'ly2835219', 'varlitinib'] Top drugs for Malignant neoplasm of breast[MeSHID:D001943]: ['mm-121', 'trastuzumab', 'margetuximab', 'pol-6326', 'pertuzumab', 'lapatinib', 'lee011', 'g1t38', 'ly2835219', 'varlitinib'] Top drugs for Triple Negative Breast Neoplasms[MeSHID:D064726]: ['azd5363', 'palbociclib', 'trilaciclib', 'atezolizumab', 'sar-566658', 'mcs-110', 'enmd-2076', 'nivolumab', 'pembrolizumab', 'tki258'] Top drugs for Inflammatory Breast Carcinoma[MeSHID:D058922]: ['palbociclib'] Drugs common across all compared sets: [] Unique drugs for Malignant neoplasm of breast[MeSHID:D001943]: [] Unique drugs for Triple Negative Breast Neoplasms[MeSHID:D064726]: ['atezolizumab', 'azd5363', 'enmd-2076', 'mcs-110', 'nivolumab', 'pembrolizumab', 'sar-566658', 'tki258', 'trilaciclib'] Unique drugs for Inflammatory Breast Carcinoma[MeSHID:D058922]: []
DrugSignature2drug Analysis¶
Utils¶
class GeneMatcher:
def __init__(self, expr_genes, threshold=90):
"""
Initialize the GeneMatcher with a list of gene names from the expression matrix
and the fuzzy matching score threshold.
Args:
expr_genes (list): List of gene names in the expression matrix.
threshold (int): Fuzzy match score cutoff (default: 90).
"""
self.expr_genes = expr_genes
self.threshold = threshold
self.mg = mygene.MyGeneInfo()
def get_gene_alias(self, gene):
"""
Query gene aliases using the MyGeneInfo database.
Args:
gene (str): The gene symbol to look up.
Returns:
list or None: List of aliases if found, else None.
"""
print(f"Querying alias for gene: {gene}")
gene_info = self.mg.query(gene, fields="alias", species="human")
gene_alias = None
if gene_info["hits"]:
for hit in gene_info["hits"]:
if "alias" in hit:
gene_alias = hit["alias"]
if not isinstance(gene_alias, list):
gene_alias = [gene_alias]
print(f"Found alias for gene {gene}: {gene_alias}")
break
if not gene_alias:
print(f"No alias found for gene: {gene}")
return gene_alias
def match_genes_with_alias(self, gene_list):
"""
Match a list of input genes to genes in the expression matrix,
using fuzzy matching and alias lookup if needed.
Args:
gene_list (list): List of input gene names to be matched.
Returns:
dict: Mapping from input gene names to matched expression matrix gene names.
"""
matched_genes = {}
for gene in gene_list:
# Try fuzzy matching the gene name directly
match = process.extractOne(gene, self.expr_genes, score_cutoff=self.threshold)
if match:
matched_genes[gene] = match[0]
print(f"Matched gene {gene} to {match[0]}")
else:
# Try matching gene aliases if direct match failed
gene_alias = self.get_gene_alias(gene)
if gene_alias:
for alias in gene_alias:
match = process.extractOne(alias, self.expr_genes, score_cutoff=self.threshold)
if match:
matched_genes[gene] = match[0]
print(f"Matched alias {alias} of gene {gene} to {match[0]}")
break
if gene not in matched_genes:
print(f"No match found for gene {gene} or its aliases")
print("-" * 50)
return matched_genes
Preprocessing¶
#could take up to 2 minutes to run
# Read the gene expression matrix from a CSV file
#this csv file is the dataset about gene expression in breast cancer cells
#it was retreived from: https://info.vizgen.com/ffpe-showcase?submissionGuid=18e2454a-cc07-4faa-93c0-b86ae41762af
expr_matrix = pd.read_csv("/share/ulli/HumanBreastCancerPatient1_cell_by_gene.csv")
print(expr_matrix.shape)
print(expr_matrix.head())
# Extract the "cell" column as metadata
metadata = expr_matrix[["cell"]]
# Drop the "cell" column from the expression matrix
expr_matrix = expr_matrix.drop(columns=["cell"])
# Convert all gene names to uppercase for consistency
expr_matrix.columns = expr_matrix.columns.str.upper()
print(expr_matrix.columns)
# Remove all genes whose names start with "BLANK"
expr_matrix = expr_matrix.loc[:, ~expr_matrix.columns.str.startswith("BLANK")]
print(expr_matrix.shape)
# Get the list of gene names present in the expression matrix
expr_genes = list(expr_matrix.columns)
(713121, 551)
cell PDK4 CCL26 CX3CL1 PGLYRP1 CD4 SNAI2 TNFRSF17 ICAM3 TBX21 \
0 0 2.0 0.0 5.0 0.0 0.0 0.0 1.0 0.0 0.0
1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2 1.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0
3 3 0.0 0.0 4.0 0.0 0.0 0.0 1.0 0.0 0.0
4 4 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
... Blank-41 Blank-42 Blank-43 Blank-44 Blank-45 Blank-46 Blank-47 \
0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Blank-48 Blank-49 Blank-50
0 1.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 0.0 0.0
4 0.0 0.0 0.0
[5 rows x 551 columns]
Index(['PDK4', 'CCL26', 'CX3CL1', 'PGLYRP1', 'CD4', 'SNAI2', 'TNFRSF17',
'ICAM3', 'TBX21', 'FAP',
...
'BLANK-41', 'BLANK-42', 'BLANK-43', 'BLANK-44', 'BLANK-45', 'BLANK-46',
'BLANK-47', 'BLANK-48', 'BLANK-49', 'BLANK-50'],
dtype='object', length=550)
(713121, 500)
So, what i see, is that a lot of drugs i selected as top drugs for treating BC are not present in the drug-gene dataset. It is challenging to find a bingo: a drug, which is still effective against BC and has a relevant data about drug target signatures. Thus, possible solution:
- the analysis performed earlier to select the best drugs stays as it is
- i use the dataset with drugs and their targets and sort of combine it with that "the most effective drug" database
- once it is combined, i 'extract' those drugs, which are present in both sources
- i create a ranking once more and get a list of top10 drug effective against BC, which have relevant drug-target data
Query DGIdb for drug-gene interactions related to BC¶
I found one source, the DGIdb, which shows me the genes which are targeted by a selected drug.
# Load DGIdb interaction data (adjust file extension if needed)
#file downloaded from: https://dgidb.org/downloads
dgidb_path = "drug_gene_interactions.tsv"
dgidb_df = pd.read_csv(dgidb_path, sep='\t')
# Preview columns and shape
print(f"Columns in DGIdb {list(dgidb_df.columns)}")
print(f"Data shape: {dgidb_df.shape}")
print(dgidb_df.head())
# Load your strict drug-target database
strict_path = "UPD_DrugTarget_BreastCancer_STRICT.csv"
strict_df = pd.read_csv(strict_path)
# Normalize column names to lowercase
dgidb_df.columns = dgidb_df.columns.str.lower()
strict_df.columns = strict_df.columns.str.lower()
# Uppercase target genes for uniformity
dgidb_df['gene_name'] = dgidb_df['gene_name'].str.upper()
strict_df['gene_name'] = strict_df['gene_name'].str.upper()
# Lowercase drug names for uniformity
dgidb_df['drug_name'] = dgidb_df['drug_name'].str.lower()
strict_df['drug_name'] = strict_df['drug_name'].str.lower()
# Filter DGIdb to keep interactions where gene is in your strict database (expressed targets)
relevant_genes = set(strict_df['gene_name'].str.upper().unique())
dgidb_filtered = dgidb_df[dgidb_df['gene_name'].isin(relevant_genes)]
print(f"Filtered DGIdb interactions count: {len(dgidb_filtered)}")
# Further, select DGIdb entries targeting antineoplastic and/or approved drugs if needed
dgidb_filtered = dgidb_filtered[(dgidb_filtered['approved'] == True) | (dgidb_filtered['anti_neoplastic'] == True)]
print(f"Filtered DGIdb for approved or antineoplastic drugs count: {len(dgidb_filtered)}")
# Get unique drug names from DGIdb filtered set
dgidb_drugs = set(dgidb_filtered['drug_name'].unique())
print(f"Unique DGIdb drugs after filtering: {len(dgidb_drugs)}")
# Your existing manual synonyms dictionary (extend as needed)
manual_synonyms = {
'mm-121': ['mm-121', 'seribantumab'],
'trastuzumab': ['trastuzumab', 'herceptin', 'trastuzumab emtansine', 'trastuzumab-dm1', 'kadcyla'],
'pol-6326': ['pol-6326', 'balixafortide'],
'pertuzumab': ['pertuzumab', 'perjeta'],
'lapatinib': ['lapatinib', 'tykerb', 'tyverb'],
'lee011': ['lee011', 'dalpiciclib'],
'g1t38': ['g1t38', 'margetuximab'],
'azd5363': ['azd5363', 'capivasertib'],
'tucatinib': ['tucatinib', 'tukysa'],
'palbociclib': ['palbociclib', 'ibrance'],
'balixafortide': ['balixafortide'],
'hki-272': ['hki-272', 'neratinib'],
'gdc-0068': ['gdc-0068', 'ipatasertib'],
'ribociclib succinate': ['ribociclib succinate', 'kisqali'],
'abemaciclib': ['abemaciclib', 'verzenio'],
'azd8931': ['azd8931', 'sapitinib'],
'bavencio': ['bavencio', 'avelumab'],
'sndx-275': ['sndx-275', 'entinostat'],
'atezolizumab': ['atezolizumab', 'tecentriq'],
'pd-0325901': ['pd-0325901', 'mirdametinib'],
'bosutinib': ['bosutinib', 'bosulif'],
'neratinib': ['neratinib', 'hki-272'],
'gdc-0032': ['gdc-0032', 'umbralisib'],
'alpelisib': ['alpelisib', 'piqray'],
'paclitaxel': ['paclitaxel', 'taxol'],
'taxol': ['taxol', 'paclitaxel'],
'xl880': ['xl880', 'foretinib'],
'gdc-0077': ['gdc-0077', 'abemaciclib'],
'everolimus': ['everolimus', 'afinitor'],
'nivolumab': ['nivolumab', 'opdivo'],
'pembrolizumab': ['pembrolizumab', 'keytruda'],
'ink128': ['ink128', 'sapanisertib'],
}
# Build full synonym dictionary mapping lowercase drug to set of synonyms
from collections import defaultdict
synonym_dict = defaultdict(set)
for canonical, syns in manual_synonyms.items():
lower_canon = canonical.lower()
synonym_dict[lower_canon].update([s.lower() for s in syns])
# Add the canonical name itself if missing
for drug in list(synonym_dict.keys()):
synonym_dict[drug].add(drug)
# Function to check if any synonym of drug is in the DGIdb drugs
def drug_in_dgidb(drug, dgidb_set, syn_dict):
drug = drug.lower()
syns = syn_dict.get(drug, {drug})
return any(s in dgidb_set for s in syns)
dgidb_drug_set = set(dgidb_filtered['drug_name'].unique())
strict_drug_set = set(strict_df['drug_name'].unique())
common_drugs_with_synonyms = {drug for drug in strict_drug_set if drug_in_dgidb(drug, dgidb_drug_set, synonym_dict)}
print(f"Number of drugs common to strict DB and DGIdb after considering synonyms: {len(common_drugs_with_synonyms)}")
print(f"Sample drugs: {list(common_drugs_with_synonyms)[:10]}")
# Save or pass common_drugs_with_synonyms forward for drugSignature2cell or other analyses
Columns in DGIdb ['gene_claim_name', 'gene_concept_id', 'gene_name', 'interaction_source_db_name', 'interaction_source_db_version', 'interaction_type', 'interaction_score', 'drug_claim_name', 'drug_concept_id', 'drug_name', 'approved', 'immunotherapy', 'anti_neoplastic']
Data shape: (98239, 13)
gene_claim_name gene_concept_id gene_name interaction_source_db_name \
0 CYP2D6 hgnc:2625 CYP2D6 DTC
1 PPARG hgnc:9236 PPARG DTC
2 ATAD5 hgnc:25752 ATAD5 DTC
3 RGS4 hgnc:10000 RGS4 DTC
4 MAPK1 hgnc:6871 MAPK1 DTC
interaction_source_db_version interaction_type interaction_score \
0 9/2/20 NaN 0.017709
1 9/2/20 NaN 0.840123
2 9/2/20 NaN 0.177992
3 9/2/20 NaN 0.034319
4 9/2/20 NaN 0.050007
drug_claim_name drug_concept_id drug_name \
0 RACLOPRIDE ncit:C152139 RACLOPRIDE
1 KALOPANAX-SAPONIN F chembl:CHEMBL1833984 CHEMBL:CHEMBL1833984
2 RO-5-3335 chembl:CHEMBL91609 CHEMBL:CHEMBL91609
3 3,4-DICHLOROISOCOUMARIN drugbank:DB04459 3,4-DICHLOROISOCOUMARIN
4 WITHAFERIN A iuphar.ligand:13097 WITHAFERIN A
approved immunotherapy anti_neoplastic
0 False False False
1 False False False
2 False False False
3 False False False
4 False False False
Filtered DGIdb interactions count: 10477
Filtered DGIdb for approved or antineoplastic drugs count: 5839
Unique DGIdb drugs after filtering: 1400
Number of drugs common to strict DB and DGIdb after considering synonyms: 72
Sample drugs: ['abexinostat', 'everolimus', 'capecitabine', 'enmd-2076', 'taxol', 'dexrazoxane', 'onapristone', 'pralatrexate', 'tanespimycin', 'tucatinib']
print(strict_df.columns)
Index(['old_row_number', 'mesh_id', 'drug_name', 'target_name', 'gene_name',
'interaction_score'],
dtype='object')
At this point I:
- Loaded your strict breast cancer drug-target database,
- Filtered the DGIdb dataset to drugs targeting relevant genes, approved or antineoplastic,
- Incorporated drug synonyms to broaden matching beyond exact names
Extracting top 10 drugs from filtered database¶
from collections import defaultdict
synonym_dict = defaultdict(set)
for canonical, syns in manual_synonyms.items():
lower_canon = canonical.lower()
synonym_dict[lower_canon].update([s.lower() for s in syns])
for drug in list(synonym_dict.keys()):
synonym_dict[drug].add(drug)
# Create sets of drugs for matching
dgidb_drug_set = set(dgidb_filtered['drug_name'].unique())
strict_drug_set = set(strict_df['drug_name'].unique())
def drug_in_dgidb(drug, dgidb_set, syn_dict):
drug = drug.lower()
syns = syn_dict.get(drug, {drug})
return any(s in dgidb_set for s in syns)
# Define common_drugs_with_synonyms and its list
common_drugs_with_synonyms = {drug for drug in strict_drug_set if drug_in_dgidb(drug, dgidb_drug_set, synonym_dict)}
common_drugs_list = list(common_drugs_with_synonyms)
# Normalize drug_scores index and common_drugs_list for matching
normalized_common_drugs = [d.strip().lower() for d in common_drugs_list]
drug_scores_norm = drug_scores.rename(index=lambda x: x.strip().lower())
# Filter the normalized drug_scores by normalized common drugs
drug_mean_scores_filtered = drug_scores_norm.loc[
[d for d in normalized_common_drugs if d in drug_scores_norm.index]]
# Get top 10 drugs by mean target expression
top_10_drugs_by_expr = drug_mean_scores_filtered.sort_values(ascending=False).head(10)
# Print results
print(f"Number of drugs in common_drugs_list: {len(common_drugs_list)}")
print(f"Number of drugs in drug_scores index: {len(drug_scores.index)}")
print(f"Number of drugs overlap after normalization: {len(drug_mean_scores_filtered)}")
print("Top 10 drugs by mean target expression (drug scores) after normalization:")
print(top_10_drugs_by_expr)
Number of drugs in common_drugs_list: 72 Number of drugs in drug_scores index: 1 Number of drugs overlap after normalization: 1 Top 10 drugs by mean target expression (drug scores) after normalization: drug_name palbociclib 0.625 Name: interaction_score, dtype: float64
I want this list to be ranked based on their interaction score, also i want to see the synonyms of these drugs¶
# Normalize the common drugs list and drug_scores index for consistent matching
normalized_common_drugs = [d.strip().lower() for d in common_drugs_list]
drug_scores.index = drug_scores.index.str.strip().str.lower()
# Filter drug_scores Series to only the drugs in your normalized common drug list
drug_scores_filtered = drug_scores.loc[drug_scores.index.intersection(normalized_common_drugs)]
# Get top 10 drugs by mean target expression (drug scores)
top_10_drugs_by_expr = drug_scores_filtered.sort_values(ascending=False).head(10)
print("Top 10 drugs selected for drugSignature2cell analysis (ranked by expression-based drug scores):")
print(top_10_drugs_by_expr)
# Optionally print synonyms for these top drugs
print("\nSynonyms for top drugs:")
for drug in top_10_drugs_by_expr.index:
syns = synonym_dict.get(drug, [drug])
print(f"{drug}: {', '.join(sorted(syns))}")
Top 10 drugs selected for drugSignature2cell analysis (ranked by expression-based drug scores): drug_name palbociclib 0.625 Name: interaction_score, dtype: float64 Synonyms for top drugs: palbociclib: ibrance, palbociclib
Interaction Score = score of how good is the lock and key between drug and gene.(Reliable scientific evidence for drug-target interaction.)
Drug Score = how many locks are present and unlocked in your cells, weighted by the lock-key fit.(How many targets the drug can affect in your specific patient’s cells.)
So one drug it is: Palbociclib, or how I call it - Pablo, the smuggler (because he is a smuggler of a drug into the cells:)
Palbociclib - selected drug¶
palbo_df = pd.read_csv("Palbociclib_Drug_Gene.tsv", sep='\t')
# downloaded from here: https://dgidb.org/results?searchType=drug&searchTerms=PALBOCICLIB
# Normalizing gene names to uppercase and preview data
palbo_df['gene'] = palbo_df['gene'].str.upper()
print(f"Preview:\n{palbo_df.head()}")
# Use ALL genes without filtering by interaction score
palbo_genes = palbo_df['gene'].unique().tolist()
Preview:
drug gene regulatory approval interaction score
0 PALBOCICLIB MAPK8 Approved 0.020071
1 PALBOCICLIB CLK2 Approved 0.042650
2 PALBOCICLIB CDK4 Approved 0.191926
3 PALBOCICLIB RB1 Approved 0.398068
4 PALBOCICLIB ERBB2 Approved 0.020071
Mapping genes to expression matrix¶
# expr_matrix is your cell x gene expression matrix with columns in uppercase
signature_genes_in_matrix = [g for g in palbo_genes if g in expr_matrix.columns]
print(f"Signature genes in expression matrix: {signature_genes_in_matrix}")
Signature genes in expression matrix: ['CDK4', 'RB1', 'ERBB2', 'CDK6', 'PIK3CA', 'CCNE1', 'MET', 'TP53', 'PTEN', 'CHEK1', 'RET', 'KRAS', 'BRAF', 'NRAS', 'CHEK2', 'CCND1', 'SMARCA4']
Calculating expression scores per cell¶
signature_expr = expr_matrix[signature_genes_in_matrix]
palbo_cell_scores = signature_expr.mean(axis=1)
print(f"Palbociclib signature scores computed for {len(palbo_cell_scores)} cells.")
print(palbo_cell_scores.head())
Palbociclib signature scores computed for 713121 cells. 0 6.000000 1 0.823529 2 1.470588 3 4.470588 4 2.529412 dtype: float64
DrugSignature2cell¶
Loading the data¶
#could take up to 2 minutes to run
# Read the gene expression matrix from a CSV file
#this csv file is the dataset about gene expression in breast cancer cells
#it was retreived from: https://info.vizgen.com/ffpe-showcase?submissionGuid=18e2454a-cc07-4faa-93c0-b86ae41762af
expr_matrix = pd.read_csv("/share/ulli/HumanBreastCancerPatient1_cell_by_gene.csv")
print(expr_matrix.shape)
print(expr_matrix.head())
# Extract the "cell" column as metadata
metadata = expr_matrix[["cell"]]
# Drop the "cell" column from the expression matrix
expr_matrix = expr_matrix.drop(columns=["cell"])
# Convert all gene names to uppercase for consistency
expr_matrix.columns = expr_matrix.columns.str.upper()
print(expr_matrix.columns)
# Remove all genes whose names start with "BLANK"
expr_matrix = expr_matrix.loc[:, ~expr_matrix.columns.str.startswith("BLANK")]
print(expr_matrix.shape)
# Get the list of gene names present in the expression matrix
expr_genes = list(expr_matrix.columns)
(713121, 551)
cell PDK4 CCL26 CX3CL1 PGLYRP1 CD4 SNAI2 TNFRSF17 ICAM3 TBX21 \
0 0 2.0 0.0 5.0 0.0 0.0 0.0 1.0 0.0 0.0
1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2 1.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0
3 3 0.0 0.0 4.0 0.0 0.0 0.0 1.0 0.0 0.0
4 4 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
... Blank-41 Blank-42 Blank-43 Blank-44 Blank-45 Blank-46 Blank-47 \
0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Blank-48 Blank-49 Blank-50
0 1.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 0.0 0.0
4 0.0 0.0 0.0
[5 rows x 551 columns]
Index(['PDK4', 'CCL26', 'CX3CL1', 'PGLYRP1', 'CD4', 'SNAI2', 'TNFRSF17',
'ICAM3', 'TBX21', 'FAP',
...
'BLANK-41', 'BLANK-42', 'BLANK-43', 'BLANK-44', 'BLANK-45', 'BLANK-46',
'BLANK-47', 'BLANK-48', 'BLANK-49', 'BLANK-50'],
dtype='object', length=550)
(713121, 500)
Finding signatures / correlated genes in the expression matrix¶
palbo_df = pd.read_csv("Palbociclib_Drug_Gene.tsv", sep="\t")
print(palbo_df.columns)
print(palbo_df.head())
Index(['drug', 'gene', 'regulatory approval', 'interaction score'], dtype='object')
drug gene regulatory approval interaction score
0 PALBOCICLIB MAPK8 Approved 0.020071
1 PALBOCICLIB CLK2 Approved 0.042650
2 PALBOCICLIB CDK4 Approved 0.191926
3 PALBOCICLIB RB1 Approved 0.398068
4 PALBOCICLIB ERBB2 Approved 0.020071
Up and down signatures¶
file_path = "Palbociclib_Drug_Gene.tsv"
palbo_df = pd.read_csv(file_path, sep='\t')
# Normalize gene names to uppercase (for consistency)
palbo_df['gene'] = palbo_df['gene'].str.upper()
# Compute median interaction score as threshold
threshold = palbo_df['interaction score'].median()
# Genes with interaction score above threshold: "up" signature
up_sig = palbo_df[palbo_df['interaction score'] > threshold][['gene']].copy()
up_sig['direction'] = 1 # Upregulated or strong targets
# Genes with interaction score below or equal to threshold: "down" signature
down_sig = palbo_df[palbo_df['interaction score'] <= threshold][['gene']].copy()
down_sig['direction'] = 0 # Downregulated or weak targets
# Combine signatures into one DataFrame
signature = pd.concat([up_sig, down_sig])
# Rename gene column to 'gene' for downstream consistency if needed
signature = signature.rename(columns={'gene': 'gene'})
# Save to a file (tab-separated, no header, no index)
signature.to_csv("palbociclib_signature_split_by_interaction_score.txt", sep='\t', index=False, header=False)
print(f"Signature saved. Number of upregulated genes: {len(up_sig)}, downregulated genes: {len(down_sig)}")
print("Upregulated (strong target) genes:")
print(up_sig['gene'].tolist())
print("\nDownregulated (weak target) genes:")
print(down_sig['gene'].tolist())
Signature saved. Number of upregulated genes: 24, downregulated genes: 27 Upregulated (strong target) genes: ['CLK2', 'CDK4', 'RB1', 'CDK6', 'PIK3CA', 'CCNE2', 'CCNE1', 'CCND2', 'CCND3', 'MST1R', 'EIF4EBP1', 'DYRK1A', 'DYRK1B', 'NRAS', 'LRRK2', 'CDKN2B', 'CHEK2', 'SMARCB1', 'PRKX', 'CCND1', 'CDKN2A', 'SMARCA4', 'NEK2', 'PAK4'] Downregulated (weak target) genes: ['MAPK8', 'ERBB2', 'ESR1', 'PRKAA1', 'ESR2', 'MET', 'DAPK3', 'TP53', 'PTEN', 'TAOK1', 'MAPK9', 'CDK9', 'CHEK1', 'RET', 'ROCK2', 'PGR', 'KRAS', 'BRAF', 'ALK', 'CDK5', 'PRKD3', 'RPS6KA3', 'FLT3', 'CLK4', 'MAP4K5', 'MAP4K2', 'MAP4K4']
Split the genes based on median interaction score — adaptable to other cutoffs (mean, quantiles, or a specific interaction score threshold)
# Assuming expr_genes is list of gene names in your expression matrix (uppercase)
matcher = GeneMatcher(expr_genes, threshold=90)
# Extract lists of gene names from your signature DataFrames
up_signatures = up_sig['gene'].tolist()
down_signatures = down_sig['gene'].tolist()
# Match with aliases
up_signatures_matched = matcher.match_genes_with_alias(up_signatures)
down_signatures_matched = matcher.match_genes_with_alias(down_signatures)
print(f"Matched up signatures: {len(up_signatures_matched)}, {list(up_signatures_matched.keys())}")
print(f"Matched down signatures: {len(down_signatures_matched)}, {list(down_signatures_matched.keys())}")
Input sequence provided is already in string format. No operation performed
Querying alias for gene: CLK2
Input sequence provided is already in string format. No operation performed
Found alias for gene CLK2: ['CLK2', 'TEL2', 'YHFS'] No match found for gene CLK2 or its aliases -------------------------------------------------- Matched gene CDK4 to CDK4 -------------------------------------------------- Matched gene RB1 to RB1 -------------------------------------------------- Matched gene CDK6 to CDK6 -------------------------------------------------- Matched gene PIK3CA to PIK3CA -------------------------------------------------- Querying alias for gene: CCNE2
Input sequence provided is already in string format. No operation performed
Found alias for gene CCNE2: ['CYCE2'] No match found for gene CCNE2 or its aliases -------------------------------------------------- Matched gene CCNE1 to CCNE1 -------------------------------------------------- Querying alias for gene: CCND2
Input sequence provided is already in string format. No operation performed
Found alias for gene CCND2: ['KIAK0002', 'MPPH3'] No match found for gene CCND2 or its aliases -------------------------------------------------- Querying alias for gene: CCND3
Input sequence provided is already in string format. No operation performed
No alias found for gene: CCND3 No match found for gene CCND3 or its aliases -------------------------------------------------- Querying alias for gene: MST1R
Input sequence provided is already in string format. No operation performed
Found alias for gene MST1R: ['CD136', 'CDw136', 'NPCA3', 'PTK8', 'RON', 'SEA', 'p185-Ron'] No match found for gene MST1R or its aliases -------------------------------------------------- Querying alias for gene: EIF4EBP1
Input sequence provided is already in string format. No operation performed
Found alias for gene EIF4EBP1: ['4E-BP1', '4EBP1', 'BP-1', 'PHAS-I'] No match found for gene EIF4EBP1 or its aliases -------------------------------------------------- Querying alias for gene: DYRK1A
Input sequence provided is already in string format. No operation performed
Found alias for gene DYRK1A: ['DYRK', 'DYRK1', 'HP86', 'MNB', 'MNBH', 'MRD7'] No match found for gene DYRK1A or its aliases -------------------------------------------------- Querying alias for gene: DYRK1B
Input sequence provided is already in string format. No operation performed
Found alias for gene DYRK1B: ['AOMS3', 'MIRK'] No match found for gene DYRK1B or its aliases -------------------------------------------------- Matched gene NRAS to NRAS -------------------------------------------------- Querying alias for gene: LRRK2
Input sequence provided is already in string format. No operation performed
Found alias for gene LRRK2: ['AURA17', 'DARDARIN', 'PARK8', 'RIPK7', 'ROCO2'] No match found for gene LRRK2 or its aliases -------------------------------------------------- Querying alias for gene: CDKN2B
Input sequence provided is already in string format. No operation performed
Found alias for gene CDKN2B: ['CDK4I', 'INK4B', 'MTS2', 'P15', 'TP15', 'p15INK4b'] No match found for gene CDKN2B or its aliases -------------------------------------------------- Matched gene CHEK2 to CHEK2 -------------------------------------------------- Querying alias for gene: SMARCB1
Input sequence provided is already in string format. No operation performed
Found alias for gene SMARCB1: ['BAF47', 'CSS3', 'INI-1', 'INI1', 'MRD15', 'PPP1R144', 'RDT', 'RTPS1', 'SNF5', 'SNF5L1', 'SWNTS1', 'Sfh1p', 'Snr1', 'hSNFS'] No match found for gene SMARCB1 or its aliases -------------------------------------------------- Querying alias for gene: PRKX
Input sequence provided is already in string format. No operation performed
Found alias for gene PRKX: ['PKX1'] No match found for gene PRKX or its aliases -------------------------------------------------- Matched gene CCND1 to CCND1 -------------------------------------------------- Querying alias for gene: CDKN2A
Input sequence provided is already in string format. No operation performed
Found alias for gene CDKN2A: ['ARF', 'CAI2', 'CDK4I', 'CDKN2', 'CMM2', 'INK4', 'INK4A', 'MLM', 'MTS-1', 'MTS1', 'P14', 'P14ARF', 'P16', 'P16-INK4A', 'P16INK4', 'P16INK4A', 'P19', 'P19ARF', 'TP16'] No match found for gene CDKN2A or its aliases -------------------------------------------------- Matched gene SMARCA4 to SMARCA4 -------------------------------------------------- Querying alias for gene: NEK2
Input sequence provided is already in string format. No operation performed
Found alias for gene NEK2: ['HsPK21', 'NEK2A', 'NLK1', 'PPP1R111', 'RP67'] No match found for gene NEK2 or its aliases -------------------------------------------------- Querying alias for gene: PAK4
Input sequence provided is already in string format. No operation performed
Found alias for gene PAK4: ['C3orf54', 'FAM212A', 'INCA'] No match found for gene PAK4 or its aliases -------------------------------------------------- Querying alias for gene: MAPK8
Input sequence provided is already in string format. No operation performed
Found alias for gene MAPK8: ['JNK', 'JNK-46', 'JNK1', 'JNK1A2', 'JNK21B1/2', 'PRKM8', 'SAPK1', 'SAPK1c'] No match found for gene MAPK8 or its aliases -------------------------------------------------- Matched gene ERBB2 to ERBB2 -------------------------------------------------- Querying alias for gene: ESR1
Input sequence provided is already in string format. No operation performed
Found alias for gene ESR1: ['ER', 'ESR', 'ESRA', 'ESTRR', 'Era', 'NR3A1'] Matched alias ER of gene ESR1 to SERPINE1 -------------------------------------------------- Querying alias for gene: PRKAA1
Input sequence provided is already in string format. No operation performed
Found alias for gene PRKAA1: ['AMPK', 'AMPK alpha 1', 'AMPKa1'] No match found for gene PRKAA1 or its aliases -------------------------------------------------- Querying alias for gene: ESR2
Input sequence provided is already in string format. No operation performed
Found alias for gene ESR2: ['ER-BETA', 'ESR-BETA', 'ESRB', 'ESTRB', 'Erb', 'NR3A2', 'ODG8'] No match found for gene ESR2 or its aliases -------------------------------------------------- Matched gene MET to MET -------------------------------------------------- Querying alias for gene: DAPK3
Input sequence provided is already in string format. No operation performed
Found alias for gene DAPK3: ['DLK', 'ZIP', 'ZIPK'] No match found for gene DAPK3 or its aliases -------------------------------------------------- Matched gene TP53 to TP53 -------------------------------------------------- Matched gene PTEN to PTEN -------------------------------------------------- Querying alias for gene: TAOK1
Input sequence provided is already in string format. No operation performed
Found alias for gene TAOK1: ['DDIB', 'KFC-B', 'MAP3K16', 'MARKK', 'PSK-2', 'PSK2', 'TAO1', 'hKFC-B', 'hTAOK1'] No match found for gene TAOK1 or its aliases -------------------------------------------------- Querying alias for gene: MAPK9
Input sequence provided is already in string format. No operation performed
Found alias for gene MAPK9: ['JNK-55', 'JNK2', 'JNK2A', 'JNK2ALPHA', 'JNK2B', 'JNK2BETA', 'PRKM9', 'SAPK', 'SAPK1a', 'p54a', 'p54aSAPK'] No match found for gene MAPK9 or its aliases -------------------------------------------------- Querying alias for gene: CDK9
Input sequence provided is already in string format. No operation performed
Found alias for gene CDK9: ['C-2k', 'CDC2L4', 'CTK1', 'PITALRE', 'TAK'] No match found for gene CDK9 or its aliases -------------------------------------------------- Matched gene CHEK1 to CHEK1 -------------------------------------------------- Matched gene RET to RET -------------------------------------------------- Querying alias for gene: ROCK2
Input sequence provided is already in string format. No operation performed
Found alias for gene ROCK2: ['ROCK-II'] No match found for gene ROCK2 or its aliases -------------------------------------------------- Querying alias for gene: PGR
Input sequence provided is already in string format. No operation performed
Found alias for gene PGR: ['NR3C3', 'PR'] Matched alias PR of gene PGR to PROX1 -------------------------------------------------- Matched gene KRAS to KRAS -------------------------------------------------- Matched gene BRAF to BRAF -------------------------------------------------- Querying alias for gene: ALK
Input sequence provided is already in string format. No operation performed
Found alias for gene ALK: ['ALK1', 'CD246', 'NBLST3'] Matched alias CD246 of gene ALK to CD2 -------------------------------------------------- Querying alias for gene: CDK5
Input sequence provided is already in string format. No operation performed
Found alias for gene CDK5: ['LIS7', 'PSSALRE'] No match found for gene CDK5 or its aliases -------------------------------------------------- Querying alias for gene: PRKD3
Input sequence provided is already in string format. No operation performed
Found alias for gene PRKD3: ['EPK2', 'PKC-NU', 'PKD3', 'PRKCN', 'nPKC-NU'] No match found for gene PRKD3 or its aliases -------------------------------------------------- Querying alias for gene: RPS6KA3
Input sequence provided is already in string format. No operation performed
Found alias for gene RPS6KA3: ['CLS', 'HU-3', 'ISPK-1', 'MAPKAPK1B', 'MRX19', 'RSK', 'RSK2', 'S6K-alpha3', 'XLID19', 'p90-RSK2', 'pp90RSK2'] No match found for gene RPS6KA3 or its aliases -------------------------------------------------- Querying alias for gene: FLT3
Input sequence provided is already in string format. No operation performed
Found alias for gene FLT3: ['CD135', 'FLK-2', 'FLK2', 'STK1'] No match found for gene FLT3 or its aliases -------------------------------------------------- Querying alias for gene: CLK4
Input sequence provided is already in string format. No operation performed
Found alias for gene CLK4: ['CLASP', 'SFRS16', 'SWAP2'] No match found for gene CLK4 or its aliases -------------------------------------------------- Querying alias for gene: MAP4K5
Input sequence provided is already in string format. No operation performed
Found alias for gene MAP4K5: ['GCKR', 'KHS', 'KHS1', 'MAPKKKK5'] No match found for gene MAP4K5 or its aliases -------------------------------------------------- Querying alias for gene: MAP4K2
Input sequence provided is already in string format. No operation performed
Found alias for gene MAP4K2: ['BL44', 'GCK', 'RAB8IP'] No match found for gene MAP4K2 or its aliases -------------------------------------------------- Querying alias for gene: MAP4K4 Found alias for gene MAP4K4: ['FLH21957', 'HEL-S-31', 'HGK', 'MEKKK4', 'NIK'] No match found for gene MAP4K4 or its aliases -------------------------------------------------- Matched up signatures: 9, ['CDK4', 'RB1', 'CDK6', 'PIK3CA', 'CCNE1', 'NRAS', 'CHEK2', 'CCND1', 'SMARCA4'] Matched down signatures: 11, ['ERBB2', 'ESR1', 'MET', 'TP53', 'PTEN', 'CHEK1', 'RET', 'PGR', 'KRAS', 'BRAF', 'ALK']
gene_sets = {
"up_genes": list(up_signatures_matched.values()),
"down_genes": list(down_signatures_matched.values())
}
print("Matched down signatures:", list(down_signatures_matched.values()))
print("Down genes for GSEA (from gene_sets):", gene_sets["down_genes"])
Matched down signatures: ['ERBB2', 'SERPINE1', 'MET', 'TP53', 'PTEN', 'CHEK1', 'RET', 'PROX1', 'KRAS', 'BRAF', 'CD2'] Down genes for GSEA (from gene_sets): ['ERBB2', 'SERPINE1', 'MET', 'TP53', 'PTEN', 'CHEK1', 'RET', 'PROX1', 'KRAS', 'BRAF', 'CD2']
all_expr_genes = set(expr_matrix.columns)
gene_sets['up_genes'] = [g for g in gene_sets['up_genes'] if g in all_expr_genes]
gene_sets['down_genes'] = [g for g in gene_sets['down_genes'] if g in all_expr_genes]
print(f"Matched up genes in expr_matrix: {len(gene_sets['up_genes'])}")
print(f"Matched down genes in expr_matrix: {len(gene_sets['down_genes'])}")
Matched up genes in expr_matrix: 9 Matched down genes in expr_matrix: 11
pip install gseapy==1.1.4
Requirement already satisfied: gseapy==1.1.4 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (1.1.4) Requirement already satisfied: numpy>=1.13.0 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from gseapy==1.1.4) (2.2.6) Requirement already satisfied: scipy in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from gseapy==1.1.4) (1.15.2) Requirement already satisfied: pandas in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from gseapy==1.1.4) (2.3.1) Requirement already satisfied: matplotlib>=2.2 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from gseapy==1.1.4) (3.10.3) Requirement already satisfied: requests in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from gseapy==1.1.4) (2.32.4) Requirement already satisfied: contourpy>=1.0.1 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from matplotlib>=2.2->gseapy==1.1.4) (1.3.2) Requirement already satisfied: cycler>=0.10 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from matplotlib>=2.2->gseapy==1.1.4) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from matplotlib>=2.2->gseapy==1.1.4) (4.59.0) Requirement already satisfied: kiwisolver>=1.3.1 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from matplotlib>=2.2->gseapy==1.1.4) (1.4.8) Requirement already satisfied: packaging>=20.0 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from matplotlib>=2.2->gseapy==1.1.4) (25.0) Requirement already satisfied: pillow>=8 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from matplotlib>=2.2->gseapy==1.1.4) (11.3.0) Requirement already satisfied: pyparsing>=2.3.1 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from matplotlib>=2.2->gseapy==1.1.4) (3.2.3) Requirement already satisfied: python-dateutil>=2.7 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from matplotlib>=2.2->gseapy==1.1.4) (2.9.0.post0) Requirement already satisfied: six>=1.5 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib>=2.2->gseapy==1.1.4) (1.17.0) Requirement already satisfied: pytz>=2020.1 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from pandas->gseapy==1.1.4) (2025.2) Requirement already satisfied: tzdata>=2022.7 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from pandas->gseapy==1.1.4) (2025.2) Requirement already satisfied: charset_normalizer<4,>=2 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from requests->gseapy==1.1.4) (3.4.2) Requirement already satisfied: idna<4,>=2.5 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from requests->gseapy==1.1.4) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from requests->gseapy==1.1.4) (2.5.0) Requirement already satisfied: certifi>=2017.4.17 in /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages (from requests->gseapy==1.1.4) (2025.8.3) Note: you may need to restart the kernel to use updated packages.
expr_matrix = expr_matrix.astype(float)
expr_matrix = expr_matrix.loc[:, ~expr_matrix.columns.duplicated()]
Small sample size GSEA to see if everything is working
import gseapy
print("Using GSEApy version:", gseapy.__version__)
# If >1.1.4, strongly consider pip installing 1.1.4 and restarting
# 5. Try with a very small subset and 1 thread as a diagnostic
result = gp.ssgsea(
data=expr_matrix.T.iloc[:, :50], # Only the first 50 cells for debugging
gene_sets=gene_sets,
outdir="/tmp/gsea_test",
sample_norm=True,
no_plot=True,
min_size=1,
threads=1,
verbose=True
)
print(result.res2d.head())
2025-08-07 05:30:45,699 [INFO] Parsing data files for ssGSEA........................... 2025-08-07 05:30:45,710 [INFO] 0000 gene_sets have been filtered out when max_size=500 and min_size=1 2025-08-07 05:30:45,711 [INFO] 0002 gene_sets used for further statistical testing..... 2025-08-07 05:30:45,712 [INFO] Start to run ssGSEA...Might take a while................
Using GSEApy version: 1.1.4 Name Term ES NES 0 35 up_genes 146.943343 0.641584 1 38 up_genes 137.182908 0.598968 2 28 up_genes 121.428185 0.53018 3 30 up_genes 120.840964 0.527616 4 9 up_genes 118.627641 0.517952
GSEA
#takes around 15-20 minutes
import os
output_dir = "/home/alexandrarolya/Pablo/PirateShip_GSEA_Palbociclib"
os.makedirs(output_dir, exist_ok=True)
ssgsea_results = gp.ssgsea(
data=expr_matrix.T, # all genes x all cells
gene_sets=gene_sets,
outdir=output_dir,
sample_norm=True,
no_plot=True,
min_size=1,
threads=12, # increase threads if you have sufficient CPU
verbose=True
)
print("ssGSEA run completed!")
print(ssgsea_results.res2d.head())
2025-08-07 05:30:45,797 [INFO] Parsing data files for ssGSEA........................... 2025-08-07 05:31:04,450 [INFO] 0000 gene_sets have been filtered out when max_size=500 and min_size=1 2025-08-07 05:31:04,451 [INFO] 0002 gene_sets used for further statistical testing..... 2025-08-07 05:31:04,452 [INFO] Start to run ssGSEA...Might take a while................
ssGSEA run completed!
Name Term ES NES
0 428690 up_genes 198.214141 0.582906
1 696368 up_genes 197.724168 0.581465
2 448392 up_genes 195.40606 0.574648
3 475556 up_genes 193.858088 0.570096
4 439546 up_genes 191.273228 0.562494
Visualisation¶
# read result
ssgsea_df = pd.read_csv(f'{output_dir}/gseapy.gene_set.ssgsea.report.csv')
# Convert to wide format (pivot)
ssgsea_pivot = ssgsea_df.pivot_table(
index='Name', # Cell ID
columns='Term', # Gene set name
values='NES' # Use normalized enrichment score
)
print(ssgsea_pivot.head())
print(ssgsea_pivot.shape)
# Randomly sample cells for visualization
sampled_cells = ssgsea_pivot.sample(n=10000, random_state=0)
Term down_genes up_genes Name 0 0.257283 0.275761 1 0.139456 0.154005 2 0.091022 0.230936 3 0.194104 0.268131 4 0.277169 0.272813 (713121, 2)
# plot
plt.figure(figsize=(12, 8)),
g = sns.clustermap(
sampled_cells,
cmap="OrRd",
figsize=(12, 8),
method="average",
row_cluster=True,
col_cluster=False, # 禁用列树状图
# standard_scale=0# plot
)
plt.figure(figsize=(12, 8)),
g = sns.clustermap(
sampled_cells,
cmap="OrRd",
figsize=(12, 8),
method="average",
row_cluster=True,
col_cluster=False # 禁用列树状图
# standard_scale=0 # Uncomment if you want standard scaling
)
g.ax_heatmap.set_xlabel("Gene Set")
g.ax_heatmap.set_ylabel("Cell")
g.ax_row_dendrogram.set_visible(False)
g.ax_col_dendrogram.set_visible(False)
g.ax_heatmap.yaxis.tick_left()
g.ax_heatmap.yaxis.set_label_position("left")
g.cax.set_position([0.95, 0.18, 0.02, 0.5])
plt.show()
/opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg) /opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
<Figure size 1200x800 with 0 Axes>
<Figure size 1200x800 with 0 Axes>
Thoughts?
- some cells are predicted more sensitive, others more resistant lol
# Visualize on spatial data;
meta_df = pd.read_csv("/share/ulli/HumanBreastCancerPatient1_cell_metadata.csv")
meta_df = meta_df.rename(columns={"Unnamed: 0": "cell_id"})
print(meta_df.head())
print(meta_df.shape)
cell_id fov volume center_x center_y min_x \
0 128421 0 547.841519 10996.346206 -211.315228 10991.189362
1 128422 0 291.101510 11103.182359 -209.935477 11099.334599
2 128425 0 489.978098 11007.296582 -209.592749 11002.998782
3 128426 0 654.037717 11106.932047 -207.366062 11100.501938
4 128427 0 861.329942 11061.537618 -205.607113 11055.883233
max_x min_y max_y
0 11001.503050 -215.639293 -206.991164
1 11107.030118 -213.992957 -205.877997
2 11011.594381 -213.726745 -205.458754
3 11113.362156 -212.213345 -202.518779
4 11067.192004 -211.373308 -199.840918
(713121, 9)
# define thresholds for up and down enrichment scores
up_es = ssgsea_pivot["up_genes"]
up_threshold = up_es.quantile(0.85) # Top 15% as "sensitive"
down_threshold = up_es.quantile(0.15) # Bottom 15% as "resistant"
print(f"Up threshold: {up_threshold}, Down threshold: {down_threshold}")
up_es = ssgsea_pivot["up_genes"]
plt.figure(figsize=(8, 6))
sns.histplot(up_es, bins=50, color='blue', kde=True)
plt.axvline(x=down_threshold, color='red', linestyle='--')
plt.axvline(x=up_threshold, color='green', linestyle='--')
plt.show()
def assign_color(value, up_threshold=up_threshold, down_threshold=down_threshold):
"""
Assign colors based on enrichment scores.
Red for down-regulated(Resistant), green for up-regulated(Sensitive), gray for neutral.
"""
if pd.isna(value): # get rid of NaN values
return "#E0E0E0"
elif value > up_threshold:
return "#74C476"
elif value < down_threshold:
return "#D8594A"
else:
return "#E0E0E0"
meta_df["up_genes"] = meta_df["cell_id"].map(ssgsea_pivot["up_genes"])
meta_df["colors"] = meta_df["up_genes"].apply(assign_color)
print(meta_df.head())
Up threshold: 0.2883582600254959, Down threshold: -0.1718620939584133
cell_id fov volume center_x center_y min_x \
cell_id
0 128421 0 547.841519 10996.346206 -211.315228 10991.189362
1 128422 0 291.101510 11103.182359 -209.935477 11099.334599
2 128425 0 489.978098 11007.296582 -209.592749 11002.998782
3 128426 0 654.037717 11106.932047 -207.366062 11100.501938
4 128427 0 861.329942 11061.537618 -205.607113 11055.883233
max_x min_y max_y up_genes colors
cell_id
0 11001.503050 -215.639293 -206.991164 -0.165207 #E0E0E0
1 11107.030118 -213.992957 -205.877997 -0.165207 #E0E0E0
2 11011.594381 -213.726745 -205.458754 -0.178850 #D8594A
3 11113.362156 -212.213345 -202.518779 0.295357 #74C476
4 11067.192004 -211.373308 -199.840918 0.039660 #E0E0E0
Thoughts?
- Broad range of NES values, with a pronounced leftmost peak (around –0.2), a central population (near 0), and a long right tail peaking around +0.3 to +0.4
- shape could suggest significant heterogeneity in drug signature activity—some cells are much more “sensitive-like,” while a large population falls closer to “neutral” or “resistant-like.”
# Calculate cell center coordinates and assign a fixed radius
x = meta_df['center_x']
y = meta_df['center_y']
colors = meta_df['colors']
radius = 5 # Fixed radius for all cells
# Create a simplified cell spatial plot
plt.figure(figsize=(20, 20))
plt.scatter(x, y, s=radius**2, c=colors, edgecolors='none', alpha=0.8) # s is the marker area
# Set simplified plot properties
min_x, max_x = meta_df['min_x'].min(), meta_df['max_x'].max()
min_y, max_y = meta_df['min_y'].min(), meta_df['max_y'].max()
plt.axis('off') # Hide axes
plt.xlim(min_x - 10, max_x + 10) # Add margin to x-axis
plt.ylim(min_y - 10, max_y + 10) # Add margin to y-axis
plt.gca().set_aspect('equal', adjustable='box') # Keep aspect ratio equal
plt.tight_layout()
plt.show()
for_SpaRx = pd.DataFrame({
"cell_id": meta_df["cell_id"],
"up_genes": ssgsea_pivot["up_genes"]
})
print(for_SpaRx.head())
for_SpaRx.to_csv("/share/ulli/PirateShip_Palbociclib_for_SpaRx.csv", index=False)
cell_id up_genes 0 128421 0.275761 1 128422 0.154005 2 128425 0.230936 3 128426 0.268131 4 128427 0.272813
Visualisation of distribution of signature scores across all cells¶
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(8, 4))
sns.histplot(palbo_cell_scores, bins=50, kde=True)
plt.title('Distribution of Palbociclib Signature Scores Across Cells')
plt.xlabel('Signature Expression Score')
plt.ylabel('Cell Count')
plt.show()
Majority of cells: Likely have low expression of palbociclib target genes -» likely to be less sensitive to the drug
Smaller subset: These cells have high expression of many palbociclib targets or strongly overexpress one or more, making them potentially more sensitive to palbociclib
Cell-cell communication analysis¶
- quintify how cells (reistant, sensitive, clisters, types) are spatially arranged
- identify spatially enriched signals or patters
Minimal requirements for analysis:
- gene expression amtrix of cells -» in my case: "expr_matrix", pandas DatFrame
- spatial coordinates for each cells (center_x, center_x in metadata) -» in my case: "meta_df"
- cell-level annotations (labels for clusters/types or predicted drug sensitivity/resistance) - in my case: cluster drug response information
Squidpy’s key applications:
- Build and analyze the neighborhood graph from spatial coordinates.
- Compute spatial statistics for cell-types and genes.
- Efficiently store, analyze and visualize large tissue images, leveraging skimage.
- Interactively explore anndata and large tissue images in napari.
Building the AnnData object¶
Some stuff i need to run because of errors...
print(meta_df.columns) # prints all column names
print(meta_df.index.name) # prints the name of the index column, if any
# Align metadata by existing index (since there is no 'cell_id' column)
meta_df_aligned = meta_df.loc[expr_matrix.index]
# Assign metadata to AnnData object
adata.obs = meta_df_aligned
# Assign spatial coordinates to AnnData.obsm
adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].to_numpy()
meta_df.index.name = 'cell_id'
Index(['cell_id', 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'max_x',
'min_y', 'max_y', 'up_genes', 'colors'],
dtype='object')
None
/opt/miniforge/envs/drug_environment2/lib/python3.10/site-packages/anndata/_core/anndata.py:740: UserWarning:
AnnData expects .obs.index to contain strings, but got values like:
[0, 1, 2, 3, 4]
Inferred to be: integer
value_idx = self._prep_dim_index(value.index, attr)
import scanpy as sc
import squidpy as sq
# Step 1: Ensure indices are strings
expr_matrix.index = expr_matrix.index.astype(str)
meta_df.index = meta_df.index.astype(str)
# Step 2: Align metadata by existing index
meta_df_aligned = meta_df.loc[expr_matrix.index] # Order metadata to match expr_matrix
# Step 3: Create AnnData
adata = sc.AnnData(expr_matrix.astype(float))
adata.obs = meta_df_aligned
# Step 4: Assign spatial coordinates
adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].to_numpy()
# Step 5: Calculate or reuse your NES scores before classification
# (make sure up_es is aligned to adata.obs)
up_es = adata.obs["up_genes"] # or from your source aligned same way
# Step 6: Define the classify_response function (already defined by you)
def classify_response(score):
if pd.isna(score):
return "unknown"
elif score > up_threshold:
return "sensitive"
elif score < down_threshold:
return "resistant"
else:
return "neutral"
# Step 7: Assign new categorical column without overwriting or messing existing columns
adata.obs["drug_response_cat"] = up_es.apply(classify_response).astype("category")
print("✅ AnnData object successfully built with categorical drug_response!")
print(f"Shape: {adata.shape}")
print(f"Metadata columns: {list(adata.obs.columns)}")
print(f"Spatial coordinates shape: {adata.obsm['spatial'].shape}")
✅ AnnData object successfully built with categorical drug_response! Shape: (713121, 500) Metadata columns: ['cell_id', 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'max_x', 'min_y', 'max_y', 'up_genes', 'colors', 'drug_response_cat'] Spatial coordinates shape: (713121, 2)
- 713,121 rows → 713,121 cells (or spatial spots), each with expression, metadata, and coordinates.
- 500 columns → expression matrix includes 500 gene features after filtering/processing.
adata.obscontains the associated metadata for each of those cells (likeup_genes,drug_response,center_x,center_y, etc.).adata.obsm"spatial"contains a 713121×2 NumPy array of spatial (x, y) coordinates.
adata = sc.AnnData(expr_matrix.astype(float))
adata.obs = meta_df_aligned
adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].to_numpy()
Creating a graph of spatial relationships between the cells¶
import squidpy as sq
sq.gr.spatial_neighbors(adata, coord_type="generic", n_neighs=6)
print("✅ Spatial neighbor graph built.")
✅ Spatial neighbor graph built.
For each cell in adata, Squidpy looked at the coordinates in adata.obsm"spatial", and found its 6 nearest neighbors using Euclidean distance.
- This graph is now stored in your AnnData object:
- The neighbor indices are in
adata.obsp'spatial_connectivities' - Neighbor distances are in
adata.obsp'spatial_distances'
Visualising the spatial neeighbor graph¶
def classify_response(score):
if pd.isna(score):
return "unknown"
elif score > up_threshold:
return "sensitive"
elif score < down_threshold:
return "resistant"
else:
return "neutral"
adata.obs['drug_response_cat'] = adata.obs['up_genes'].apply(classify_response).astype(str)
print(adata.obs['drug_response_cat'].value_counts(dropna=False))
print(adata.obs['drug_response_cat'].unique())
drug_response_cat neutral 499302 sensitive 106968 resistant 106851 Name: count, dtype: int64 ['neutral' 'resistant' 'sensitive']
#takes arount 1 minute
import matplotlib.pyplot as plt
import networkx as nx
connectivities = adata.obsp['spatial_connectivities']
G = nx.from_scipy_sparse_array(connectivities)
positions = {i: pos for i, pos in enumerate(adata.obsm["spatial"])}
plt.figure(figsize=(8, 8))
nx.draw_networkx_edges(G, positions, alpha=0.3, edge_color='gray')
# Use colors already assigned in your metadata
node_colors = adata.obs['colors'].tolist()
nx.draw_networkx_nodes(G, positions, node_size=10, node_color=node_colors)
plt.title("Spatial Neighbor Graph: Cells connected to 6 nearest neighbors")
plt.axis('off')
plt.show()
What i would like to do in further steps?
- analyse wheether drug-sensitive and resistant cells show spatial clustering or random distribution
- use ligand-receptor interaction inference with Squidpy to understand molecular signaling between drug response-defined neighborhoods
- identify if resistant and sensitive cells tend to spatially co-localuze or exclude each other
- try mapping GWAS results and other phenotypes alongside drug repsonse
Think about?
- Does neighborhood enrichment reveal distinct resistant/sensitive clusters?
- Which ligand-receptor pairs are enriched between these groups — what does it imply about signaling pathways or microenvironment effects?
- How do spatial patterns relate to known or hypothesized drug mechanisms from your signature and target mapping efforts?
import squidpy as sq
# Rebuild spatial neighbor graph with different neighbors number
sq.gr.spatial_neighbors(adata, coord_type="generic", n_neighs=15) # try 8, 10, 12, 15
# Ensure categorical dtype
adata.obs['drug_response_cat'] = adata.obs['drug_response_cat'].astype('category')
# Run neighborhood enrichment
sq.gr.nhood_enrichment(adata, cluster_key="drug_response_cat")
# Visualize with adjusted color scale and figure size
sq.pl.nhood_enrichment(
adata,
cluster_key="drug_response_cat",
vmin=-10, # Try increasing magnitude for more color range
vmax=7,
figsize=(8, 6)
)
100%|██████████| 1000/1000 [01:19<00:00, 12.63/s]
So this output suggests me whether cells from the same or different drug response groups (e.g., “sensitive”, “resistant”, “neutral”) tend to be spatial neighbors (are close in tissue), more often or less often than you’d expect by random chance. This is evidence for spatial self-segregation or clustering—e.g., resistant cells cluster with resistant, sensitive with sensitive, etc.
Each Square (Cell) in the Heatmap:
- Diagonal Squares (e.g., “sensitive”-“sensitive”) show if sensitive cells cluster together.
- Off-diagonal Squares (e.g., “sensitive”-“resistant”) show if sensitive and resistant cells are neighbors more (positive, yellow) or less (negative, blue) than expected.
Color Value (Z-score):
- Bright yellow/red (high positive): These two classes are found together as neighbors more than by chance (they cluster).
- Blue (negative): They are found as neighbors less than by chance (they avoid each other).
- Pale/neutral (~0): No special clustering or avoidance—random spatial organization.
Overlaying my predicted classes on a spatial scatter plot. If spatial domains are correct, I'll see “solid-color neighborhoods” for each class
#takes around 1 minute
import scanpy as sc
sc.pl.spatial(
adata,
color="drug_response_cat",
size=8, # marker size, can adjust if needed
spot_size=30, # essential to specify since no spatial image metadata
title="Spatial Map of Predicted Drug Response Classes",
legend_loc="right margin"
)
/tmp/ipykernel_62984/1433500138.py:3: FutureWarning: Use `squidpy.pl.spatial_scatter` instead. sc.pl.spatial(